## Warning in rm(prosperLoans): object 'prosperLoans' not found
Some data has been parsed incorrectly: i.e. dates as factors, prosper scores as integers (instead of factors), etc. We should clean this up so that the data is easier to work with.
# Convert prosper rating to a factor
prosperLoans$ProsperRating..numeric. <- factor(prosperLoans$ProsperRating..numeric.)
# Convert prosper score to a factor
prosperLoans$ProsperScore <- factor(prosperLoans$ProsperScore)
# Convert Listing Category to a factor
prosperLoans$ListingCategory..numeric. <- factor(prosperLoans$ListingCategory..numeric.)
# Convert Loan Origination Date from factor to a date
prosperLoans$LoanOriginationDate <- as.Date(prosperLoans$LoanOriginationDate, format="%Y-%m-%d")
# Convert Listing Creation Date from factor to a date
prosperLoans$ListingCreationDate <- as.Date(prosperLoans$ListingCreationDate, format="%Y-%m-%d")
# Convert Closed Date from factor to a date
prosperLoans$ClosedDate <- as.Date(prosperLoans$ClosedDate, format="%Y-%m-%d")
# Convert Term column to a factor
# Term column is an integer. We should convert this to a factor (12/36/60 months)
prosperLoans$Term <- factor(prosperLoans$Term)
# Reorder credit grade levels so that AA appears first instead of second
prosperLoans$CreditGrade <- relevel(prosperLoans$CreditGrade, "AA")
# Convert origination quarter to ordered factor
prosperLoans$LoanOriginationQuarter <- ordered(prosperLoans$LoanOriginationQuarter, levels = c("Q4 2005","Q1 2006","Q2 2006","Q3 2006","Q4 2006","Q1 2007","Q2 2007","Q3 2007","Q4 2007","Q1 2008","Q2 2008","Q3 2008","Q4 2008","Q1 2009","Q2 2009","Q3 2009","Q4 2009","Q1 2010","Q2 2010","Q3 2010","Q4 2010","Q1 2011","Q2 2011","Q3 2011","Q4 2011","Q1 2012","Q2 2012","Q3 2012","Q4 2012","Q1 2013","Q2 2013","Q3 2013","Q4 2013","Q1 2014","Q2 2014","Q3 2014","Q4 2014"))
# Estimated Loss (Rate) is a very precise number. For purposes of "buckets" in our plots, let's round to 2- and 3- digits.
prosperLoans$EstimatedLossRounded2 <- round(prosperLoans$EstimatedLoss, 2)
prosperLoans$EstimatedLossRounded3 <- round(prosperLoans$EstimatedLoss, 3)
# Round Borrower APR to 2-digits so we can use it in buckets.
prosperLoans$BorrowerAPRRounded2 = round(prosperLoans$BorrowerAPR, 2)
# Calculate maturity date from the start date and term of loan.
get_maturity_date <- function(start_date, term) {
term_as_number = as.numeric(as.character(prosperLoans$Term))
month(start_date) <- month(start_date) + term_as_number
return(start_date)
}
# Function to 'merge' and map pre- and post- 2009 credit rating fields
generateUdacityRating <- function(creditGrade, prosperRatingAlpha) {
tempRating = ""
tempRating = ifelse(creditGrade == "", prosperRatingAlpha , creditGrade)
tempRating = ifelse(tempRating == "NC", "HR", tempRating)
tempRatingAsNumber = ifelse(tempRating == 'AA', 7, ifelse(tempRating == 'A',6, ifelse(tempRating == 'B', 5,
ifelse(tempRating == 'C', 4, ifelse(tempRating == 'D', 3, ifelse(tempRating == 'E', 2, 1))))))
return(tempRatingAsNumber)
}
# Return the numeric value of a factor (as long as it is a number)
getNumberFromFactor <- function(numericFactor) {
return(as.numeric(as.character(numericFactor)))
}
# To distinguish custom columns from the original dataset columns, we will preface added column names with "Udacity"
# Is the loan in default (customer unable to pay)
prosperLoans$UdacityInDefault = ifelse(prosperLoans$LoanStatus == "Chargedoff" | prosperLoans$LoanStatus == "Defaulted" , 1 , 0)
prosperLoans$UdacityInDefaultFactor = factor(prosperLoans$UdacityInDefault)
# For defaults, what percentage of loan principal remains unpaid?
prosperLoans$UdacityDefaultPrincipalUnpaidPercent <- ifelse(prosperLoans$UdacityInDefault == 1, (1 - prosperLoans$LP_CustomerPrincipalPayments / prosperLoans$LoanOriginalAmount),0)
# Prosper used different fields to rate borrower credit. Pre-2009 = CreditGrade, Post-2009 = ProsperRating.
# This field merges the two fields taking the newer field if available.
prosperLoans$UdacityRating = generateUdacityRating(as.character(prosperLoans$CreditGrade), as.character(prosperLoans$ProsperRating..Alpha.))
prosperLoans$UdacityRating <- factor(prosperLoans$UdacityRating)
prosperLoans$UdacityRatingAsNumber <- getNumberFromFactor(prosperLoans$UdacityRating)
# How much does Prosper predict investors will lose on their principal on this category of loans (based on forecast
# loss rate and principal)?
prosperLoans$UdacityEstimatedLossAmount = ifelse(is.na(prosperLoans$EstimatedLoss),0,prosperLoans$EstimatedLoss * prosperLoans$LoanOriginalAmount)
prosperLoans$UdacityMaturityDate <- get_maturity_date(prosperLoans$LoanOriginationDate, prosperLoans$Term)
# For paid off loans, was the loan paid ahead of schedule?
# This is a "problem" for investors who are expecting a steady stream of income
prosperLoans$UdacityIsPrepaid <- ifelse(prosperLoans$LoanStatus == "Completed" & prosperLoans$ClosedDate < prosperLoans$UdacityMaturityDate,1,0)
# Prosper has numerous statuses that are difficult to visualize on a chart. Let's add an alternate status field for charting.
# ClosedPrepaid: Completed before maturity
# ClosedPositive: Cancelled, Completed
# ClosedNegative: Defaulted, Chargedoff
# OpenPositive: Current, FinalPaymentInProgress
# OpenNegative: PastDue*
prosperLoans$UdacityLoanStatusVisual <- ifelse(prosperLoans$UdacityIsPrepaid == 1,"ClosedPrepaid", ifelse( prosperLoans$LoanStatus=="Cancelled" | prosperLoans$LoanStatus=="Completed", "ClosedPositive", ifelse(prosperLoans$UdacityInDefault == 1, "ClosedNegative", ifelse(prosperLoans$LoanStatus == "Current" | prosperLoans$LoanStatus == "FinalPaymentInProgress", "OpenPositive", "OpenNegative"))))
prosperLoans$UdacityLoanStatusVisual <- factor(prosperLoans$UdacityLoanStatusVisual)
# Create separate variable to hold defaulted loans
prosperDefaultedLoans = prosperLoans[prosperLoans$UdacityInDefault == 1,]
cat("Total # Rows: " , nrow(prosperLoans))
## Total # Rows: 113937
summary(prosperLoans$LoanOriginationDate)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2005-11-15" "2008-10-02" "2012-06-26" "2011-07-21" "2013-09-18"
## Max.
## "2014-03-12"
summary(prosperLoans$LoanOriginalAmount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1000 4000 6500 8337 12000 35000
# Plot by day
num_loans_by_day_graph <- ggplot(prosperLoans, aes(LoanOriginationDate)) +
geom_histogram(binwidth = 1) +
labs(title = "Daily Loan Counts")
# Change bin width to 30-days to minimize daily spikes and troughs
loan_amount_by_day_graph <- ggplot(prosperLoans, aes(LoanOriginationDate)) +
geom_histogram(binwidth = 30) +
labs(title = "Quarterly Loan Counts")
grid.arrange(num_loans_by_day_graph, loan_amount_by_day_graph, ncol=1)
Some observations about this data:
https://en.wikipedia.org/wiki/Prosper_Marketplace#Cease_and_desist_order
On November 24, 2008, the SEC found Prosper to be in violation of the Securities Act of 1933. As a result of these findings, the SEC imposed a cease and desist order on Prosper. Due primarily to the novel nature of the peer-to-peer lending models, the SEC, after review, now treats all peer-to-peer lending transactions as sales of securities and requires that all platforms register with the SEC.
Ok this explains the gap.
ggplot(aes(x = LoanOriginationDate, y = LoanOriginalAmount), data=prosperLoans) +
geom_line(stat = 'summary', fun.y = "mean") +
labs(title = "Mean loan amount over time") +
geom_smooth()
Both the number of loans and loan amounts (aside from the interruption in 2008) have increased How about total loaned amounts?
ggplot(aes(x = LoanOriginationDate, y = LoanOriginalAmount), data=prosperLoans) +
geom_line(stat = 'summary', fun.y = "sum") +
labs(title = "Total loan amount over time") +
geom_smooth()
The daily amounts have peaks and valleys that make the plots tough to look at. Using the quarterly date column, let’s see how data changes across quarters instead of each day:
loan_quarter_groups <- group_by(prosperLoans, LoanOriginationQuarter)
prosper_loans_by_quarter <- summarise(loan_quarter_groups, total_loan_amount = sum(LoanOriginalAmount), mean_loan_amount = mean(LoanOriginalAmount), count = n(), count_default = sum(UdacityInDefault))
ggplot(aes(x = LoanOriginationQuarter, y = LoanOriginalAmount), data=prosperLoans) + geom_bar(stat = 'identity') +
labs(title = "Total loan amount over time")
Total amount loaned each day has increased dramatically since day 1 of the Prosper service. The last column shows a slight decrease but this is likely due to an incomplete quarter. The last day for loan data was March 12, 2014 with 19 days left in the quarter.
# Stacked barchart for loan status
ggplot(prosperLoans, aes(LoanOriginationDate, fill=LoanStatus)) +
geom_bar() +
labs(title = "Loan Status Breakdown (by Date)")
## Warning: position_stack requires non-overlapping x intervals
So many (12!) statuses! It is difficult to read so many different statuses on a stacked bar chart. Let’s redraw this using a new “Visual Status” field that summarizes status into positive (current, paid off, prepaid) and negative (late payments, loan default) buckets.
ggplot(prosperLoans, aes(x = LoanOriginationDate, fill = UdacityLoanStatusVisual)) +
geom_histogram() +
labs(title = "Loan Status Breakdown (by Quarter)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Defaults as a percentage of all loans seem to be decreasing. This is a good sign.
One point of interest is that there are many prepaid (Status is complete, Close Date < Loan Maturity Date) loans in Prosper where borrowers paid off the loan ahead of schedule. How many of the completed loans are prepaid versus paid on schedule?
# It seems many of these loans are prepaid. What is the comparison between loans completed at maturity versus those which are prepaid?
ggplot(prosperLoans[prosperLoans$LoanStatus == "Completed",], aes(x = LoanOriginationDate, fill = UdacityLoanStatusVisual)) +
geom_histogram() +
labs(title = "Loan Status Breakdown (by OriginationDate)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most of the completed loans, even dating back to 2008, are prepaid. This is troubling for Prosper investors (loan buyers) who may expect a steady stream of payments up until the maturity of the loan.
Loans may be prepaid for a variety of reasons including reduction in interest rates, better loan terms offered elsewhere, selling personal assets to pay debt, or general improvement in personal wealth. Without more information, it is hard to identify why Prosper borrowers are prepaying their loans in such high numbers.
# What are the common sizes for loans?
ggplot(aes(x = LoanOriginalAmount), data=prosperLoans) + geom_histogram(binwidth = 1000) +
scale_x_continuous(breaks = seq(0, 30000, 5000))
summary(prosperLoans$LoanOriginalAmount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1000 4000 6500 8337 12000 35000
Many loans are quite small (less than US$10,000). The largest loan offered by Prosper is $35,000 (confirmed on their website).
Many of the loans are in round units of $5,000. The peaks in this histogram are at $5k, $10k, $15k, $20k, and $25k. $35k loans are new (see chart in the final Plots section) so the number of these loans is still quite small.
ggplot(prosperLoans, aes(ListingCategory..numeric.)) +
geom_bar() +
labs(title = "# Loans broken down by listing category")
summary(prosperLoans$ListingCategory..numeric.)
## 0 1 2 3 4 5 6 7 8 9 10 11
## 16965 58308 7433 7189 2395 756 2572 10494 199 85 91 217
## 12 13 14 15 16 17 18 19 20
## 59 1996 876 1522 304 52 885 768 771
Debt consolidation (1) is by far the most popular type of loan. The 2nd and 3rd most popular listing categories are “Not Available” (0) and “Other” (7) which is not very descriptive. Home Improvement (2) and Business (3) loans round out the top 5 categories.
How about borrowers and their credit? What kinds of borrowers are more common in the Prosper Loans marketplace?
Prosper changed their credit rating field in 2009 from “CreditGrade” to “ProsperRating”. In order to effectively plot data across all dates, I have merged the two credit rating fields into a single “UdacityRating” field.
ggplot(prosperLoans, aes(x=UdacityRating)) +
geom_bar() +
labs(title = "Borrower Credit Quality for Prosper Loans")
There is a normal distribution of credit ratings with most loans occurring at medium credit quality.
# Interest rates distribution of loans
summary(prosperLoans$BorrowerAPR)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00653 0.15630 0.20980 0.21880 0.28380 0.51230 25
ggplot(prosperLoans, aes(x=BorrowerAPR)) +
geom_histogram(binwidth = 0.05) +
labs(title = "Borrower APR for Prosper Loans (BinWidth = 0.05)")
## Warning: Removed 25 rows containing non-finite values (stat_bin).
Interest rates range from 0.6% to 51.2%! Mean interest rate is 21.9%.
51.2%? This seems exceptionally high. What kinds of borrowers have to pay interest of more than 50%?
prosperLoans[!is.na(prosperLoans$BorrowerAPR) & prosperLoans$BorrowerAPR > 0.5,][c("LoanOriginationDate","ClosedDate","CreditGrade","IsBorrowerHomeowner","BorrowerAPR","LoanOriginalAmount","LoanStatus","StatedMonthlyIncome")]
## LoanOriginationDate ClosedDate CreditGrade IsBorrowerHomeowner
## 36019 2006-02-27 2006-03-14 HR False
## 56762 2006-03-21 2006-09-19 HR False
## BorrowerAPR LoanOriginalAmount LoanStatus StatedMonthlyIncome
## 36019 0.51229 1500 Completed 2838
## 56762 0.50633 3000 Defaulted 0
The borrowers of these 2 loans had exceptionally bad (or no) credit. The credit grade of “HR” refers to “Highest Risk” which is the lowest credit grade. Either there was no credit history or the borrower has a history of defaults.
(Sourced from: http://www.liquisearch.com/prosper_marketplace/evaluation_of_credit_risk/credit_grades )
Additionally, the borrowers were not homeowners and had low stated monthly incomes ($2,838 and $0 respectively). The high risk of lending to these individuals likely led to the high interest rate. The first borrower paid back the loan within a month. The second borrower defaulted within 6 months.
With a large binwidth (5%), the histogram looks similar to a normal distribution. However, if we reduce the BorrowerRate APR binwidth to 1%, the graph shows more widely distributed interest rates.
ggplot(prosperLoans, aes(x=BorrowerAPR)) +
geom_histogram(binwidth = 0.01) +
labs(title = "Borrower APR for Prosper Loans (BinWidth = 0.01)") +
scale_x_continuous(breaks = seq(0.00,.5,0.05), labels = c('0%','5%','10%','15%','20%','25%','30%','35%','40%','45%','50%'))
## Warning: Removed 25 rows containing non-finite values (stat_bin).
paste("# of Loans with Borrower APR between 35% and 36%: ", nrow(prosperLoans[prosperLoans$BorrowerAPR >= 0.35 & prosperLoans$BorrowerAPR < 0.36,]))
## [1] "# of Loans with Borrower APR between 35% and 36%: 7693"
Here, the distribution appears wider with multiple peaks including more than 7000 loans between 35 and 36%.
ggplot(prosperLoans[prosperLoans$UdacityInDefault == 1,], aes(UdacityDefaultPrincipalUnpaidPercent)) +
geom_histogram(binwidth = 0.1) +
labs(title = "Loss %")
Most defaults happen with >75% of the principal still left unpaid.
ggplot(aes(x = getNumberFromFactor(Term)), data=prosperLoans) + geom_histogram() +
scale_x_continuous(breaks = c(12,36,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
36-months is the most popular term length for Prosper Laons. 60-months is the next most popular term.
12-month loans do not appear to be popular. Is this term still offered?
paste("Last 12-month loan was offered in: ", max(prosperLoans[prosperLoans$Term == 12,]$LoanOriginationDate))
## [1] "Last 12-month loan was offered in: 2013-04-15"
paste("Last loan in the system was offered: ", max(prosperLoans$LoanOriginationDate))
## [1] "Last loan in the system was offered: 2014-03-12"
The last 12-month loan was 11-months before the last loan in this dataset so this term may no longer be offered.
“Charged-off” and “Defaulted” loans technically mean the same thing: The borrower was unable to pay back the loan. Let’s compare the total number of defaulted loans to the number of ‘completed’ loans using nrow().
paste("# of Closed loans: ",nrow(prosperLoans[!is.na(prosperLoans$ClosedDate),]))
## [1] "# of Closed loans: 55089"
paste("# of Completed (paid-off) loans: ",nrow(prosperLoans[prosperLoans$LoanStatus == 'Completed',]))
## [1] "# of Completed (paid-off) loans: 38074"
paste("# of Unpaid / Defaulted loans: ", nrow(prosperLoans[prosperLoans$UdacityInDefault == 1,]))
## [1] "# of Unpaid / Defaulted loans: 17010"
ggplot(prosperLoans[!is.na(prosperLoans$ClosedDate),], aes(UdacityInDefault)) +
geom_bar() +
labs(title = "# of Completed and Defaulted Loans") +
scale_x_continuous(breaks = c(0,1), labels = c("Completed", "In Default"))
Almost 1/3 of the closed loans are in default! This does not sound good. However, there are also a large number of open loans in the system which are not counted here.
What are the prosper scores for these defaulted loans?
prosperDefaultedLoans$ProsperScoreAsNumber <- getNumberFromFactor(prosperDefaultedLoans$ProsperScore)
summary(prosperDefaultedLoans$ProsperScoreAsNumber)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.000 5.000 5.427 7.000 11.000 10669
ggplot(prosperDefaultedLoans, aes(ProsperScoreAsNumber)) +
geom_histogram() +
labs(title = "Loan defaults by Prosper Score", x = "ProsperScore", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 10669 rows containing non-finite values (stat_bin).
Prosper scores for these defaults follow a normal distribution where the mean score is 5.4 right down the middle. (Highest prosper score possible is 11)
For defaults, what % of the loan principal remains unpaid?
ggplot(prosperDefaultedLoans, aes(UdacityDefaultPrincipalUnpaidPercent)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(prosperDefaultedLoans$UdacityDefaultPrincipalUnpaidPercent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.01884 0.61330 0.81160 0.73710 0.92300 1.00000
The histogram shows that most loans default with most of the principal still unpaid. The average default loan has 75% of the principle outstanding. This is not a good sign. What is the distribution of prosper scores where over 75% of the principal remains outstanding? If Prosper’s scoring algorithms are predicting losses correctly, we should see lower scores in this distribution.
ggplot(prosperDefaultedLoans[!is.na(prosperDefaultedLoans$ProsperScore) & prosperDefaultedLoans$UdacityDefaultPrincipalUnpaidPercent >= 0.75,], aes(ProsperScore)) +
geom_bar() +
labs(title = "Large Loan Defaults by Prosper Score", x = "ProsperScore", y = "Count")
summary(prosperDefaultedLoans[!is.na(prosperDefaultedLoans$ProsperScore) & prosperDefaultedLoans$UdacityDefaultPrincipalUnpaidPercent >= 0.75,]$ProsperScoreAsNumber)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 5.000 5.212 7.000 11.000
The scores here follow a similar distribution with the normal distribution centered at 5.2 instead of 5.4. Since Prosper Score combines borrower credit rating with historical Prosper data to create an improved ‘risk score’, one would think this field would better predict large defaults. Has this scoring system become any better at predicting large losses since it was introduced in 2009? Let’s compare pre- and post-2011 loan scores and see.
ggplot(prosperLoans[prosperLoans$UdacityInDefault == 1 & prosperLoans$UdacityDefaultPrincipalUnpaidPercent >= 0.75 & prosperLoans$LoanOriginationDate < '2013-01-01',], aes(getNumberFromFactor(ProsperScore))) +
geom_histogram() +
labs(title = "Prosper Scores for Pre-2013 Large Loan Defaults", x = "ProsperScore", y = "Count") +
scale_x_continuous(breaks = seq(1,11,1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5560 rows containing non-finite values (stat_bin).
ggplot(prosperLoans[prosperLoans$UdacityInDefault == 1 & prosperLoans$UdacityDefaultPrincipalUnpaidPercent >= 0.75 & prosperLoans$LoanOriginationDate > '2013-01-01',], aes(getNumberFromFactor(ProsperScore))) +
geom_histogram() +
labs(title = "Prosper Scores for Post-2013 Large Loan Defaults", x = "ProsperScore", y = "Count") +
scale_x_continuous(breaks = seq(1,11,1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
print("Average time (# days) to default (for defaulted loans)")
## [1] "Average time (# days) to default (for defaulted loans)"
mean(as.numeric(prosperDefaultedLoans$ClosedDate - prosperDefaultedLoans$LoanOriginationDate))
## [1] 510.8509
The distributions are very different. Instead of a normal distribution of prosper scores, the post-2013 scores shows a downward trending pattern where higher scores (better loans) experience fewer defaults. This is ideal behavior and indicates that Prosper Score may be improving at predicting default risk.
However, since post-2013 loans are relatively new and the average loan default occurs after 510 days, perhaps this distribution reflects poor quality borrowers who default early in the loan term. As loans mature, many more could default which would affect the above histogram. My personal belief is that when analyzing defaults and probability of defaults, examining defaults on loans issued after October 2012 (511 days before this file was created) are not worth investigating in this particular dataset.
This dataset includes 113,937 loans issued between November 15, 2005 and March 12, 2014. These loans were for amounts of between $1,000 and $35,000 with a mean (average) loan amount of $8,337. There is a gap in dates between October 18, 2008 and May 5, 2009. This gap represents period when the Securities Exchange Commission (SEC) ordered Prosper Marketplace (and other peer-to-peer lenders) to temporarily cease business activities pending regulatory review. This event is described in the Wikipedia entry for Prosper Marketplace:
https://en.wikipedia.org/wiki/Prosper_Marketplace
Each record (row) in the dataset includes different kinds of information.
Borrower Details:
Loan Details
Loan Status
Borrower credit quality is measured in different ways depending on the date of the loan. Key components are borrower credit scores (available for all loans except a few of the oldest loans), existing/open lines of credit, income, and home ownership. These attributes are combined into a single credit quality field which changed midway through the time this data was collected.
Pre-2009:
CreditGrade - Experian ScoreEx rating.From lowest to highest: NC (No Credit), HR (High Risk), E, D, C, B, A, AA
Post-2009:
ProsperRating (Numeric) - From lowest to highest: 1, 2, 3, 4, 5, 6, 7
The old scoring system has 8 levels where the new system has 7. From reviewing web sites related to Prosper Rating, I believe that “NC” (No Credit) and “HR” (High Risk) from the old-style “CreditGrade” field have been combined into a single rating (“HR”) in the new ProsperRating.
References:
http://www.orchardplatform.com/blog/alphabetic-ratings-and-numeric-scores-on-prosper/ http://www.lendacademy.com/prosper-review/
Borrower credit quality is an input to the Prosper Score which appears to be the key variable determining interest rate in today’s Prosper Marketplace loans.
The main features of interest to me are:
Yes, I created the following variables:
I combined two fields (CreditGrade and ProsperRating..numeric.) into a single field: UdacityRating. This will make plots possible across all the data instead of breaking things up into pre-2009 (CreditGrade) and post-2009 (ProsperRating) charts.
I also created a parallel “Visual Status” field (see above) so that I would not have to work with the granular statuses used in this dataset.
I also changed the data types of a few fields to fix parsing issues after loading the CSV file.
Date fields originally got parsed from the CSV file as factors so I had to change them to date fields. This is useful for running calculations against dates (date differences) and ordering dates properly.
Also, listing categories, terms, and rating scores were originally parsed as character data or numeric data. I changed these to factor fields.
I did not notice highly unusual distributions although there was a wide range of Borrower APR interest rates across the data (0.6% - 50%).
The key driver for BorrowerAPR appears to be borrower credit rating (UdacityRating). Which columns correlate strongest to this credit rating field?
library("GGally")
theme_set(theme_minimal(20))
summary(prosperLoans[prosperLoans$LoanOriginationDate > '2009-01-01',]$ProsperScore)
## 1 2 3 4 5 6 7 8 9 10 11 NA's
## 992 5766 7642 12595 9813 12278 10597 12053 6911 4750 1456 144
ggpairs(prosperLoans, columns = c('UdacityRatingAsNumber','CreditScoreRangeUpper','CreditScoreRangeLower','CurrentDelinquencies','CurrentCreditLines','EmploymentStatusDuration','AmountDelinquent','StatedMonthlyIncome'))
## Warning in fn(x$data, plotObj$mapping): Removed 591 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 591 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 697 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 7604 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7625 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7622 rows containing
## missing values
## Warning: Removed 591 rows containing missing values (geom_point).
## Warning: Removed 591 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 591 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 697 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 7604 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7625 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7622 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 591 rows containing missing
## values
## Warning: Removed 591 rows containing missing values (geom_point).
## Warning: Removed 591 rows containing missing values (geom_point).
## Warning: Removed 591 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 697 rows containing missing
## values
## Warning in fn(x$data, plotObj$mapping): Removed 7604 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7625 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7622 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 591 rows containing missing
## values
## Warning: Removed 697 rows containing missing values (geom_point).
## Warning: Removed 697 rows containing missing values (geom_point).
## Warning: Removed 697 rows containing missing values (geom_point).
## Warning: Removed 697 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 7624 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7645 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7622 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 697 rows containing missing
## values
## Warning: Removed 7604 rows containing missing values (geom_point).
## Warning: Removed 7604 rows containing missing values (geom_point).
## Warning: Removed 7604 rows containing missing values (geom_point).
## Warning: Removed 7624 rows containing missing values (geom_point).
## Warning: Removed 7604 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 7627 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7624 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7604 rows containing
## missing values
## Warning: Removed 7625 rows containing missing values (geom_point).
## Warning: Removed 7625 rows containing missing values (geom_point).
## Warning: Removed 7625 rows containing missing values (geom_point).
## Warning: Removed 7645 rows containing missing values (geom_point).
## Warning: Removed 7627 rows containing missing values (geom_point).
## Warning: Removed 7625 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 7645 rows containing
## missing values
## Warning in fn(x$data, plotObj$mapping): Removed 7625 rows containing
## missing values
## Warning: Removed 7622 rows containing missing values (geom_point).
## Warning: Removed 7622 rows containing missing values (geom_point).
## Warning: Removed 7622 rows containing missing values (geom_point).
## Warning: Removed 7622 rows containing missing values (geom_point).
## Warning: Removed 7624 rows containing missing values (geom_point).
## Warning: Removed 7645 rows containing missing values (geom_point).
## Warning: Removed 7622 rows containing non-finite values (stat_density).
## Warning in fn(x$data, plotObj$mapping): Removed 7622 rows containing
## missing values
## Warning: Removed 591 rows containing missing values (geom_point).
## Warning: Removed 591 rows containing missing values (geom_point).
## Warning: Removed 697 rows containing missing values (geom_point).
## Warning: Removed 7604 rows containing missing values (geom_point).
## Warning: Removed 7625 rows containing missing values (geom_point).
## Warning: Removed 7622 rows containing missing values (geom_point).
Credit score appears to have the highest correlation to borrower credit rating. How does this look on a chart?
# Credit score and Prosper rating are STRONGLY correlated
ggplot(prosperLoans, aes(x = CreditScoreRangeUpper, y = getNumberFromFactor(UdacityRating))) +
geom_line(stat = 'summary', fun.y = mean) +
labs(title = "Credit Score vs. Prosper Rating", x = "Credit Score", y = "Credit Rating")
## Warning: Removed 591 rows containing non-finite values (stat_summary).
Aside from a relatively flat line at lower credit score levels, the correlation beteen credit score and credit rating appears quite high. How many Prosper borrowers have a credit score below 500?
cat("# of Loans (rows):", nrow(prosperLoans))
## # of Loans (rows): 113937
cat("Minimum credit score:", min(prosperLoans[!is.na(prosperLoans$CreditScoreRangeUpper),]$CreditScoreRangeUpper))
## Minimum credit score: 19
# 19??? That is really low...
cat("# of Loans with <500 credit score borrowers:", nrow(prosperLoans[prosperLoans$CreditScoreRangeUpper < 500,]))
## # of Loans with <500 credit score borrowers: 1253
cat("# of loans with 19 credit score borrowers:", nrow(prosperLoans[prosperLoans$CreditScoreRangeUpper == 19,]))
## # of loans with 19 credit score borrowers: 724
Many of these loans are associated with borrowers with a credit score of 19! I did not think this was possible!
Around 1% of the borrowers have credit scores less than 500. These loans appear quite rare which may explain why these loans are grouped in the same prosper rating category as typical lower credit scores (500 - 600). If we remove these outliers, does the correlation get stronger?
ggplot(prosperLoans[prosperLoans$CreditScoreRangeUpper >= 500,], aes(x = CreditScoreRangeUpper, y = getNumberFromFactor(UdacityRating))) +
geom_line(stat = 'summary', fun.y = mean) +
labs(title = "Credit Score vs. Prosper Rating", x = "Credit Score", y = "Credit Rating")
## Warning: Removed 591 rows containing non-finite values (stat_summary).
Yes! When low credit scores (possibly outliers) are removed, credit score becomes a key driver in Udacity’s credit rating system!
Are the number of borrower delinquencies correlated with the borrower’s credit rating?
#Prosper Rating somewhat related to delinquencies... but check out the data point at 51 delinquencies!
# Prosper Rating is 5 for this despite having 51 delinquencies. Perhaps this is due to the higher credit score.
ggplot(prosperLoans, aes(x = CurrentDelinquencies, y = getNumberFromFactor(UdacityRating))) +
geom_line(stat = 'summary', fun.y = mean) +
labs(title = "# Current Delinquencies vs. Prosper Rating", x = "# Current Delinquencies", y = "Prosper Credit Rating")
## Warning: Removed 697 rows containing non-finite values (stat_summary).
Delinquencies and credit rating are negatively correlated which is expected. However at 50 delinquencies, there is a huge uptick in Prosper Rating! Is this a significant data point or an outlier?
Let’s query loans with >40 delinquencies and see if this is an isolated or common case?
prosperLoans[!is.na(prosperLoans$CurrentDelinquencies) & prosperLoans$CurrentDelinquencies >= 40 & prosperLoans$CurrentDelinquencies < 60,][c('CurrentDelinquencies','UdacityRating','LoanOriginationDate','LoanOriginalAmount','LoanStatus','BorrowerAPR','Term','CreditScoreRangeLower','ClosedDate','StatedMonthlyIncome')]
## CurrentDelinquencies UdacityRating LoanOriginationDate
## 11902 41 1 2008-06-11
## 35285 59 1 2007-01-23
## 40760 50 1 2007-04-11
## 44477 57 1 2006-05-18
## 62462 45 1 2007-07-30
## 83363 41 1 2006-10-06
## 105566 40 1 2008-03-10
## 109209 51 5 2013-12-27
## LoanOriginalAmount LoanStatus BorrowerAPR Term
## 11902 1000 Chargedoff 0.37453 36
## 35285 1550 Defaulted 0.25723 36
## 40760 1000 Completed 0.27926 36
## 44477 2000 Chargedoff 0.24691 36
## 62462 1000 Chargedoff 0.19986 36
## 83363 4000 Defaulted 0.29776 36
## 105566 1000 Chargedoff 0.37453 36
## 109209 15000 Current 0.17611 36
## CreditScoreRangeLower ClosedDate StatedMonthlyIncome
## 11902 520 2009-03-13 1665.667
## 35285 480 2007-06-24 1969.000
## 40760 520 2010-04-11 2666.667
## 44477 480 2008-03-18 6100.000
## 62462 540 2010-11-29 2500.000
## 83363 500 2007-05-07 4166.667
## 105566 520 2010-07-09 600.000
## 109209 700 <NA> 5000.000
There is one loan in the result set with a Prosper rating of ‘5’. Despite the high (50) delinquencies, he/she has a relatively high credit score (700) compared to other borrowers. In Udacity’s rating algorithm, perhaps credit score is a larger factor than the number of delinquencies. In any case, this data point is an outlier.
Does the number of current credit lines affect credit rating?
ggplot(prosperLoans, aes(x = CurrentCreditLines, y = UdacityRatingAsNumber)) +
geom_line(stat = 'summary', fun.y = mean) +
labs(title = "Current Credit Lines vs. Prosper Rating", x = "Current Credit Lines", y = "Prosper Credit Rating")
## Warning: Removed 7604 rows containing non-finite values (stat_summary).
At the lower range (0 - 10), # of credit lines appears to affect credit rating. However for borrowers with more than 10 credit lines, there does not seem to be any increase in credit rating.
How about employment duration in months?
summary(prosperLoans$EmploymentStatusDuration)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 26.00 67.00 96.07 137.00 755.00 7625
ggplot(prosperLoans, aes(y = UdacityRatingAsNumber, x = EmploymentStatusDuration)) +
geom_line(stat = 'summary', fun.y = mean) +
labs(title = "Employment Duration vs. Prosper Rating", x = "Employment Duration (months)", y = "Prosper Credit Rating")
## Warning: Removed 7625 rows containing non-finite values (stat_summary).
There does not appear to be much of a correlation here. However an interesting thing happens when we switch the X and Y axis. If we plot Prosper Rating (x) against Employment Duration in months, we see a pattern.
ggplot(prosperLoans, aes(y = EmploymentStatusDuration, x = UdacityRatingAsNumber)) +
geom_line(stat = 'summary', fun.y = mean) +
labs(title = "Prosper Rating vs. Employment Duration", x = "Credit Rating", y = "Employment Duration (months)")
## Warning: Removed 7625 rows containing non-finite values (stat_summary).
On average, for lower rated borrowers there is correlation between # of months employed and Udacity credit rating. However at around credit score 5, the correlation ends and the trend reverses. Borrowers with the highest scores tend to have been employed for fewer months than medium credit borrowers. Perhaps more successful people tend to switch companies to pursue better (higher paying?) opportunities which means less time employed at a single company.
Continuing from where we left off at the histograms in the univariate section, let’s look at the impact of borrower credit rating on Borrower APR.
ggplot(prosperLoans, aes(x=BorrowerAPR, fill = UdacityRating)) +
geom_histogram() +
labs(title = "Borrower APR and Credit Rating") +
guides(fill=guide_legend(title="Prosper Rating"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).
Generally, borrower credit rating and interest rate are negatively correlated. Higher rated borrowers receive lower interest rates. However credit rating alone does not determine interest rates as can be seen by ranges of interest rates for any particular credit grade / prosper rating.
Loan defaults occur when a borrower is unable to pay back their loan. These can be catastrophic to Prosper investors who count on an income stream from interest payments and their original investment to be returned at loan maturity.
Can Prosper predict loan defaults?
Prosper uses borrow credit reports to predict default rates (and assign appropriate interest rates). How well do credit scores predict loan defaults?
ggplot(aes(x = CreditScoreRangeUpper, y = UdacityInDefault), data = prosperLoans) +
geom_line(stat = 'summary', fun.y = 'mean') +
scale_x_continuous(breaks = seq(0,800,100)) +
labs(title = "Credit Score vs. Default Rate", x = "Credit Score", y = "Default Rate (%)")
## Warning: Removed 591 rows containing non-finite values (stat_summary).
Similar to the above charts, there is some noise for loans with credit scores less than 500. Let’s remove these outliers and redraw the graph:
ggplot(aes(x = CreditScoreRangeUpper, y = UdacityInDefault), data = prosperLoans[prosperLoans$CreditScoreRangeUpper >= 500,]) +
geom_line(stat = 'summary', fun.y = 'mean') +
scale_x_continuous(breaks = seq(0,800,100)) +
labs(title = "Credit Score vs. Default Rate", x = "Credit Score", y = "Default Rate (%)")
## Warning: Removed 591 rows containing non-finite values (stat_summary).
This looks ‘better’. As the borrower credit scores increase, loan default % decreases. After a steep decrease until ~680, the default percent stays relatively flat.
Looking at ‘default %’ using the entire Prosper loans dataset is problematic in that both matured loans and new/open loans are included in the data set. If we are trying to calculate a “default percent”, it is not quite ‘correct’ to include loans that have only been issued for a few months. A better approach would be to take all loans on or before a ‘cutoff date’ and evaluate default percentages based on this reduced dataset.
To identify a cutoff date, let’s check the most recent loan date and the average # of days that a default occurs based on defaulted loans’ origination dates and close dates.
maxDate = max(prosperLoans$LoanOriginationDate)
cat("Most recent loan was originated in: ", maxDate)
## Most recent loan was originated in: 16141
cat("Average time (# days) to default (for defaulted loans)", mean(as.numeric(prosperDefaultedLoans$ClosedDate - prosperDefaultedLoans$LoanOriginationDate)))
## Average time (# days) to default (for defaulted loans) 510.8509
cutoffDate = maxDate - 511
cat("Using mean days to default, what is a 'safe' day before which to look at defaulted loans? ", as.character(cutoffDate))
## Using mean days to default, what is a 'safe' day before which to look at defaulted loans? 2012-10-17
Using 2012-10-17 as a cutoff date, let’s draw the graph again for >500 credit score loans:
preCutoffLoans <- prosperLoans[prosperLoans$LoanOriginationDate <= cutoffDate,]
ggplot(aes(x = CreditScoreRangeUpper, y = UdacityInDefault), data = preCutoffLoans[preCutoffLoans$CreditScoreRangeUpper >= 500,]) +
geom_line(stat = 'summary', fun.y = 'mean') +
scale_x_continuous(breaks = seq(0,800,100)) +
labs(title = "Credit Score vs. Default Rate", x = "Credit Score", y = "Default Rate (%)")
## Warning: Removed 591 rows containing non-finite values (stat_summary).
This chart shows a smoother decrease in default percentages for low-middle credit scores. The previous chart showed a sharp decrease in defaults which was likely due to newer (still current) loans being included in the dataset. During the term of these loans, many more will default if previous patterns are consistent. We are still underestimating default rates towards the end of our limited data, but hopefully we capture many of the defaults while using as much of the data as possible.
How well does the 7-point Prosper Rating score (which takes into account other factors like employment status and income) predict defaults?
ggplot(aes(x = UdacityRatingAsNumber, y = UdacityInDefault), data = preCutoffLoans) +
geom_line(stat = 'summary', fun.y = 'mean') +
labs(title = "Prosper Rating vs. Default Rate", x = "Prosper Rating", y = "Default Rate (%)")
Higher Prosper Rating also correlated to lower defaults. However the credit scores provide a much more granular predicting factor for default %s. With only 7 possible Prosper Rating scores, many different borrowers will get grouped into the same score which is why the lowest rated borrows still default less than 40% of the time (as opposed to the 60+% default rate for the lowest credit scores).
Does the EstimatedLoss field correlate with more defaults?
ggplot(aes(x = EstimatedLoss, y = UdacityInDefault), data = preCutoffLoans) +
geom_line(stat = 'summary', fun.y = 'mean') +
labs(title = "Estimated Loss vs. Default Rate", x = "Estimated Loss", y = "Default Rate (%)")
## Warning: Removed 29084 rows containing non-finite values (stat_summary).
The line graph has a lot of noise. Since EstimatedLoss is a very precise field (4-digits precision), the data is quite scattered. Let’s try rounding to 2-digits (creating buckets) and see if the correlation appears stronger.
ggplot(aes(x = EstimatedLossRounded2, y = UdacityInDefault), data = preCutoffLoans) +
geom_line(stat = 'summary', fun.y = 'mean') +
geom_smooth() +
labs(title = "Estimated Loss (Rounded) vs. Default Rate", x = "Estimated Loss (Rounded)", y = "Default Rate (%)")
## Warning: Removed 29084 rows containing non-finite values (stat_summary).
## Warning: Removed 29084 rows containing non-finite values (stat_smooth).
This second chart shows a stronger correlation between higher EstimatedLoss predictions and number of actual defaults. Since estimated loss is likely at least partially derived from a borrower’s credit rating, this is not a surprise.
Unlike the previous graphs, here we can see how closely defaults are matching Prosper’s predictions. At first glance, it appears that Prosper’s predictions are proving too conservative. At 0.1% estimated losses, roughly 0.2% of the loans are defaulting. However, estimated loss is a prediction of lost PRINCIPAL, not defaults, so let us run a similar chart with estimated loss plotted against actual principal loss.
# Add a slope 1 line
ggplot(aes(x = EstimatedLossRounded2, y = UdacityDefaultPrincipalUnpaidPercent), data = preCutoffLoans) +
geom_line(stat = 'summary', fun.y = 'mean') +
geom_smooth() +
geom_abline(intercept = 0, slope = 1, linetype = 2) +
labs(title = "Estimated Loss vs. Unpaid Principal", x = "Estimated Loss (%)", y = "Unpaid Principal (%)")
## Warning: Removed 29084 rows containing non-finite values (stat_summary).
## Warning: Removed 29084 rows containing non-finite values (stat_smooth).
For loans where Estimated Losses are <0.2%, actual losses exceed Prosper’s estimates. However, above 0.2%, Prosper’s estimates are higher than actual losses. How does Prosper do overall on predicting daily losses versus the actual principal losses?
ggplot(prosperLoans[!is.na(prosperLoans$EstimatedEffectiveYield),], aes(x = LoanOriginationQuarter)) +
geom_line(aes(y=UdacityEstimatedLossAmount, group = 'Estimated Losses', color = "Estimated Losses"), stat = 'summary', fun.y = 'sum') +
geom_line(aes(y=LP_NetPrincipalLoss, group = 'Estimated Losses', color = "Actual Losses"), stat = 'summary', fun.y = 'sum') +
scale_y_continuous(breaks = seq(1000000,10000000,1000000), labels = c("1M","2M","3M","4M","5M","6M","7M","8M","9M","10M")) +
labs(title = "Principal Losses", x = "Loan Origination Quarter", y = "Loss Amount") +
guides(fill=guide_legend(title="Losses"))
This plot paints a different picture. Through the end of 2010, Prosper underestimated the actual losses by a small margin. In 2011 and 2012, the gap between estimated and actual losses increased to about $1M/quarter. This coincides with an increase in loan volumes so perhaps the difference is not so meaningful.
In late 2012, the actual losses appear to dive. However this is likely due to new loans that are still current but may default soon enough (see details above).
Staying on topic of defaults, does BorrowerAPR accurately predict risk of default? Let’s plot default rate against 2-decimal rounded APR loans:
#
ggplot(aes(x = BorrowerAPRRounded2, y = UdacityInDefault), data = preCutoffLoans) +
geom_line(stat = 'summary', fun.y = 'mean') +
geom_smooth()
## Warning: Removed 25 rows containing non-finite values (stat_summary).
## Warning: Removed 25 rows containing non-finite values (stat_smooth).
Interesting! Although default rates are higher for extremely high (>40%) interest rates, a larger section of intreset rates (15% - 40%) shows a flat default rate where a third of the loans default.
From my analysis, BorrowerAPR is determined primarily by a borrower’s credit score (and Prosper Rating). Other factors affecting APR possibly include defaults and employment duration. This is difficult to determine by the provided data since credit score itself likely takes into account these factors. The Prosper Rating by itself is enough to estimate a pretty narrow range for APR.
Comparing defaults to paid-off loans, it appears that on average the defaulted loans are paying higher interest rates on their loans.
Default rates correlate negatively with Prosper Rating. Low rated loans default close to 40% of the time whereas top-rated (Prosper Rating = 7) loans default less than 15% of the time.
Credit score on its own also correlates negatively with default rates. Credit scores below 600 default more than 50% of the time. High (>850) credit scores default less than 5% of the time. Since credit score is a lot more granular, this value provides much more detail than the 7-category Prosper Rating score.
Estimated Loss also is also correlated with default rate but not as strongly as Credit Rating or credit scores.
Prosper Rating better predicts default rates as opposed to Prosper Score. Estimated losses overall appear to underestimate the actual principal losses due to defaults. Prosper seems to be getting better at predicting defaults compared to when they first started.
The strongest relationships I found were twofold:
Interest rates change every day. So may the Prosper Rating. When we look at average APR across 2005 - 2014 loans, we miss the daily fluctuations in interest rate. If we track interest rates segregated by Prosper Rating, what can we learn?
# Date vs. Borrower APR (and Prosper Rating)
ggplot(prosperLoans, aes(x = LoanOriginationDate, y = BorrowerAPR)) +
geom_line(aes(color = UdacityRating), stat = 'summary', fun.y = mean) +
labs(title = "Date vs. Borrower APR", x = "Loan Origination Date", y = "Borrower APR") +
guides(fill=guide_legend(title="Prosper Rating"))
## Warning: Removed 25 rows containing non-finite values (stat_summary).
This graph is interesting in two ways: 1. Prosper Rating’s influence on interest rates is obvious. Unlike the histograms in the univariate plot, this time we see a narrow band of interest rates for each Prosper Rating. 2. As time goes on, the interest rates become clearly tiered and separated (by Prosper Rating) especially compared to pre-2009 interest rates.
Is interest rate on average higher for loans that default versus those that do not?
ggplot(preCutoffLoans, aes(x = LoanOriginationDate, y = BorrowerAPR)) +
geom_line(aes(color = UdacityInDefaultFactor), stat = 'summary', fun.y = mean) +
labs(title = "Date vs. Borrower APR", x = "Loan Origination Date", y = "Borrower APR")
## Warning: Removed 25 rows containing non-finite values (stat_summary).
On most days, loans that defaulted, on average, were assigned higher interest rates than those that did not. How about on a quarterly basis?
ggplot(preCutoffLoans, aes(x = LoanOriginationQuarter, y = BorrowerAPR)) +
geom_line(aes(color = UdacityInDefaultFactor, group = UdacityInDefaultFactor), stat = 'summary', fun.y = mean) +
labs(title = "Date vs. Borrower APR", x = "Loan Origination Quarter", y = "Borrower APR")
## Warning: Removed 25 rows containing non-finite values (stat_summary).
This graph is much easier to read than the daily view.
Except for Q2 2009, on a quarterly basis, interest rates appear around 5% higher for loans that eventually default versus loans that are paid back in full. 2009 was the year of the SEC shutdown and there were very few loans before the second half of the year. How many loans?
cat("# of Loans in Q2 2009:", nrow(prosperLoans[prosperLoans$LoanOriginationQuarter == 'Q2 2009',]))
## # of Loans in Q2 2009: 13
There were only 13 loans in this quarter. This is an extremely small dataset for a whole quarter therefore we can ignore this datapoint as an outlier.
Of loans issued daily, how many default for different borrowers across different credit (Prosper) ratings?
ggplot(preCutoffLoans, aes(x = LoanOriginationQuarter, y = UdacityInDefault)) +
geom_line(aes(color = UdacityRating, group = UdacityRating), stat = 'summary', fun.y = mean) +
labs(colour = "Prosper Rating")
There are some interesting findings here: 1. Generally higher ratings translate to fewer defaults. 2. In early quarters, low rated loans defaulted at extremely high rates (>60%)! This rate came down to below 40% in 2009 perhaps due to improved loan underwriting. 3. Default rates do overlap between the lowest rated (1-2) and medium rated (4-5) loans. However overall the ratings do tend to correctly predict relative default rates. 4. At the end of the graph, many of the lines do cross over. It will be interesting to see if the default rates change before the loans mature.
Does Prosper Score, which takes into account previous loan performance, provide a better or worse indicator of relative defaults?
ggplot(preCutoffLoans, aes(x = LoanOriginationQuarter, y = UdacityInDefault)) +
geom_line(aes(color = ProsperScore, group = ProsperScore), stat = 'summary', fun.y = mean)
This graph is incredibly hard ot look at. The default rates cross over each other frequently and it’s hard to identify the 11 colors on a single chart. Let’s try to break this up into higher scoring (>4) versus lower scoring (<=4) loans. First, let’s look at the lowest scoring loans:
ggplot(preCutoffLoans[getNumberFromFactor(preCutoffLoans$ProsperScore) <= 4, ], aes(x = LoanOriginationQuarter, y = UdacityInDefault)) +
geom_line(aes(color = ProsperScore, group = ProsperScore), stat = 'summary', fun.y = mean)
## Warning: Removed 29084 rows containing non-finite values (stat_summary).
Default rates vary wildly from quarter to quarter. Default rates swing from almost 45% to less than 30% in a single quarter for the lowest (1) scoring loans. In the 1st quarter of 2011, the 2nd lowest scoring tier loans did not default at all. This looks suspicious. How many loans exist in each Prosper Score bucket?
summary(preCutoffLoans$ProsperScore)
## 1 2 3 4 5 6 7 8 9 10 11 NA's
## 847 1333 1813 3436 4374 5504 4475 6916 4068 2053 0 29084
summary(prosperLoans[prosperLoans$LoanOriginationQuarter == 'Q1 2011',]$ProsperScore)
## 1 2 3 4 5 6 7 8 9 10 11
## 0 1 0 113 263 352 255 356 229 175 0
summary(prosperLoans[prosperLoans$LoanOriginationQuarter == 'Q2 2011',]$ProsperScore)
## 1 2 3 4 5 6 7 8 9 10 11
## 135 113 136 353 352 392 278 386 200 133 0
Interesting… in the first quarter of 2011, there was only 1 loan in the lowest 3 prosper scores. Then in the second quarter, this number increased to 350+ loans. Overall, for the lower scores, it does not look like a strong negative correlation between Prosper Score and default rates. Does this look different for the higher scores?
ggplot(preCutoffLoans[getNumberFromFactor(preCutoffLoans$ProsperScore) > 4, ], aes(x = LoanOriginationQuarter, y = UdacityInDefault)) +
geom_line(aes(color = ProsperScore, group = ProsperScore), stat = 'summary', fun.y = mean)
## Warning: Removed 29084 rows containing non-finite values (stat_summary).
This chart is more like what I was expecting. Loans which have been assigned the highest Prosper Score have very low (<5%) default rates THere is some overlap across the middle scores, but overall the trend is clear. Higher scores correlate with lower defaults.
Looking at a daily or quarterly snapshot of loans and interest rates (as opposed to all dates at once) reduces noise created by daily changes to interest rates, Prosper loan pricing algorithms, and improved risk detection procedures. There is less overlap of mean interest rates for borrowers of different prosper credit ratings.
Yes. I was surprised that borrower credit rating seems to be ‘better’ at predicting default rates than Prosper’s “Prosper Score” which is supposed to better approximate risk with additional analytics based on Prosper loan performance.
No I did not create a model for this dataset. I considered looking at a model for Borrower APR or Estimated Loss. However it is very difficult to identify the criteria used to determine either factor beyond the credit scores behind each loan.
We drew a histogram earlier showing loan counts and aggregate loan amounts on a daily basis for the entire dataset. The boxplot here provides addition detail that adds new information.
Looking at loan status across all dates (and across Borrower Rating) reveals some interesting facts:
Loan prepayments are everywhere at all levels. It seems that no prosper borrowers are holding their loans for the full term. As a Prosper investor, I would be very concerned about this. As a lender, I evaluate each loan and choose to invest after becoming comfortable with the risks and returns. Every time a loan is prepaid before the term is complete, I need to find another loan with similar criteria (and risk) to keep receiving my targeted return. Every investment is a risk and this makes investing in Prosper loans more risky and more work.
As witnessed in some of the above charts, lower rated borrowers defaulted at high rates early on. These default rates came down across all rating levels after the 2009 shutdown. Defaults are creeping up again at all borrower credit levels, but the increase is not as high as the total number of loans. Lower rated borrowers are defaulting at much higher rates than high rated borrowers. This is expected. However PRosper should keep an eye on the default rates and make sure they stay manageable.
## Warning: Removed 25 rows containing missing values (geom_point).
This is a scatterplot version of the Prosper Rate vs. Mean Borrower APR line graph that we plotted in the Multivariate section. The difference is hear we see all data points instead of just the mean interest rates. With the additional data comes some new findings.
I was excited to have the opportunity to look at real loan data from Prosper Market, a FinTech pioneer, who was one of the first US providers of peer-to-peer lending products.
This dataset is particularly interesting because the loan data contains three different types of information in each record:
Borrower details and loan details are setup when the loan is issued. This data is not updated afterwards. The loan status is updated during the life of the loan.
Using this information, I attempted to derive the “Default Ratio” of Prosper Loans and collect visualizations showing whether borrower credit rating or other information could predict higher or lower loan defaults. While focusing on older loans (more than 511 days old, reflecting the average life of a defaulted loan), I noticed a tight correlation between credit rating and loan default. This was noticeable with Prosper’s 7-tier rating system but was even more apparent when looking at borrower credit scores. Borrowers with the highest credit scores rarely defaulted on their loans while low credit score borrowers defaulted most of the time on their loans. Interest rates were also tied closely to credit scores which makes sense considering the relationship between credit score and loan defaults.
Initially I was looking forward to comparing Prosper’s Estimated Return against actual investor returns from loans in this dataset. Unfortunately I quickly ran into problems due to limited knowledge of this dataset. SOme of these problems were:
Perhaps the actual return on investment from Prosper loans would be more easily extracted using a different kind of data. Pulling data from an Accounts Payable system that pays out the loan investors would be a more accurate way to identify the investor returns and compare them to the originating loans (and estimated returns). As Prosper improves their algorithms to better identify potential risks due to defaults and prepayments, they can more effectively communicate these risks to investors who will be happier when this product (Prosper Loans) behaves according to expectations with fewer surprises.
rm(prosperLoans)
rm(defaultedLoans)
## Warning in rm(defaultedLoans): object 'defaultedLoans' not found
rm(preCutoffLoans)